Genome wide analyses

Load packages

Code
library(tidyverse) # tidy coding, ggplot etc
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.3     ✔ readr     2.1.4
✔ forcats   1.0.0     ✔ stringr   1.5.0
✔ ggplot2   3.4.4     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.0
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
Code
library(glue) # for coding within strings
library(bigsnpr) # to install:   devtools::install_github("privefl/bigsnpr")
Loading required package: bigstatsr
Code
library(pander) # for tables
library(future.apply) # for parallel code running
Loading required package: future
Code
library(brms) # for bayesian models
Loading required package: Rcpp
Loading 'brms' package (version 2.20.4). Useful instructions
can be found by typing help('brms'). A more detailed introduction
to the package is available through vignette('brms_overview').

Attaching package: 'brms'

The following object is masked from 'package:stats':

    ar
Code
library(ggtext) # for markdown syntax in ggplot
library(MetBrewer) # for more colour palettes
library(MoMAColors) # nicer colours once again
Registered S3 method overwritten by 'MoMAColors':
  method        from     
  print.palette MetBrewer
Code
library(patchwork) # building composite plots
library(DT) # for nice tables
library(kableExtra) # for more nice tables

Attaching package: 'kableExtra'

The following object is masked from 'package:dplyr':

    group_rows
Code
library(ggrepel) # for labelling ggplots
library(pheatmap) # for heat maps

# build a helper function that produces a table to display our data

# Create a function to build HTML searchable tables

my_data_table <- function(df){
  datatable(
    df, rownames=FALSE,
    autoHideNavigation = TRUE,
    extensions = c("Scroller",  "Buttons"),
    options = list(
      autoWidth = TRUE,
      dom = 'Bfrtip',
      deferRender=TRUE,
      scrollX=TRUE, scrollY=1000,
      scrollCollapse=TRUE,
      buttons =
        list('pageLength', 'colvis', 'csv', list(
          extend = 'pdf',
          pageSize = 'A4',
          orientation = 'landscape',
          filename = 'Lifespan_data')),
      pageLength = 100
    )
  )
}

Building the dataset

Here’s the raw data from each study. This was then passed here (part one) and later returned to conduct GWAS. The dataset has >185,000 rows (one row for each individual) so we don’t directly show it in this report. However, it can be downloaded from the data folder found in the github repository associated with this analysis.

Code
# Load genotyped lines reported in Huang et al for cross-matching and filtering

genotyped_lines <- read_csv("data/Input/Genotyped_lines.csv") %>% 
  mutate(line = as.factor(line),
         Genotyped = "YES")
Rows: 205 Columns: 1
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
dbl (1): line

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Code
# Load the lifespan datasets 

Arya_raw_data <- read_csv("data/Input/Raw_data/Arya_2010.csv") %>% 
  mutate(Sex = if_else(Sex == "F", "Female", "Male")) %>% 
  mutate(line = as.factor(line),
         Sex = as.factor(Sex),
         Vial_ID = as.factor(Vial_ID),
         Block = as.factor(Block),
         Diet = as.numeric(Diet),
         Adult_age_at_start = 2,
         Social_group_size = Density,
         Density = Density / 47,
         Death_day = Lifespan,
         Lifespan = Death_day + Adult_age_at_start) %>% 
  dplyr::select(line, Sex, Vial_ID, Lifespan, Adult_age_at_start, Death_day, Block, Study, Temperature, Diet, Social_group_size, Density, Mated, Prop_female)
Rows: 13438 Columns: 11
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (3): Sex, Study, Mated
dbl (7): line, Vial_ID, Lifespan, Block, Temperature, Density, Prop_female
lgl (1): Diet

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Code
Arya_lines <- Arya_raw_data %>% distinct(line) %>% inner_join(genotyped_lines) %>%  nrow() # differs from Ivanov because they collected a little bit more data in that study and combined with Ivanov. Those data were not emailed.
Joining with `by = join_by(line)`
Code
Huang_raw_data <- read_csv("data/Input/Raw_data/Huang_2020.csv") %>% 
  dplyr::select(-Vial_ID2) %>% 
  mutate(line = as.factor(Line),
         Sex = as.factor(Sex),
         Vial_ID = as.factor(Vial_ID),
         Block = as.factor(Block),
         Diet = as.numeric(Diet),
         Study = "Huang_2020",
         Adult_age_at_start = 1,
         Social_group_size = Density,
         Density = Density / 47,
         Death_day = Lifespan,
         Lifespan = Death_day + Adult_age_at_start) %>% 
  dplyr::select(line, Sex, Vial_ID, Lifespan, Adult_age_at_start, Death_day, Block, Study, Temperature, Diet, Social_group_size, Density, Mated, Prop_female)
Rows: 70209 Columns: 11
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (3): Sex, Vial_ID2, Mated
dbl (8): Line, Vial_ID, Lifespan, Block, Temperature, Prop_female, Density, ...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Code
Huang_lines <- Huang_raw_data %>% distinct(line) %>% inner_join(genotyped_lines) %>% nrow() # perfect match with study
Joining with `by = join_by(line)`
Code
Wilson_raw_data <- read_csv("data/Input/Raw_data/Wilson_2020.csv") %>% 
  rename(Vial_ID = Replicate,
         Block = Week) %>% 
  mutate(line = as.factor(Line),
         Sex = as.factor(Sex),
         Vial_ID = as.factor(Vial_ID),
         Block = as.factor(Block),
         Diet = as.numeric(Diet),
         Adult_age_at_start = 2,
         Social_group_size = Density,
         Density = Density / 47,
         Death_day = Lifespan,
         Lifespan = Death_day + Adult_age_at_start) %>% 
  dplyr::select(line, Sex, Vial_ID, Lifespan, Adult_age_at_start, Death_day, Block, Study, Temperature, Diet, Social_group_size, Density, Mated, Prop_female)
Rows: 67605 Columns: 11
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (3): Sex, Study, Mated
dbl (8): Line, Lifespan, Week, Replicate, Diet, Temperature, Prop_female, De...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Code
Wilson_lines <- Wilson_raw_data %>% distinct(line) %>% inner_join(genotyped_lines) %>%  nrow()
Joining with `by = join_by(line)`
Code
Durham_raw_data <- read_csv("data/Input/Raw_data/Durham_2014.csv") %>% 
  dplyr::select(1:4, 6, 14:18) %>% 
  rename(Vial_ID = Fly_ID) %>% 
  mutate(Sex = "Female") %>% 
  mutate(line = as.factor(Line),
         Sex = as.factor(Sex),
         Vial_ID = as.factor(Vial_ID),
         Block = as.factor(Block),
         Diet = as.numeric(Diet),
         Adult_age_at_start = 0,
         Social_group_size = Density,
         Density = Density / 47,
         Death_day = Lifespan,
         Lifespan = Death_day + Adult_age_at_start) %>% 
  dplyr::select(line, Sex, Vial_ID, Lifespan, Adult_age_at_start, Death_day, Block, Study, Temperature, Diet, Social_group_size, Density, Mated, Prop_female)
Rows: 4132 Columns: 18
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr  (8): Week_1_fecundity, Week_3_fecundity, Week_5_fecundity, Week_7_fecun...
dbl (10): Fly_ID, Block, Diet, Line, Thorax_length, Lifespan, Lifetime_fecun...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Code
Durham_lines <- Durham_raw_data %>% distinct(line) %>% anti_join(genotyped_lines) %>%  nrow()
Joining with `by = join_by(line)`
Code
Patel_raw_data <- read_csv("data/Input/Raw_data/Patel_2021.csv") %>% 
  left_join(read_csv("data/Input/Raw_data/Patel_2021.csv") %>% distinct(Line) %>% mutate(Vial_ID = 1:n())) %>% 
  mutate(Diet = 4,
         line = str_remove(Line, "DGRP "),
         line = as.factor(line),
         Sex = as.factor(Sex),
         Vial_ID = as.factor(Vial_ID),
         Block = as.factor(Block),
         Diet = as.numeric(Diet),
         Adult_age_at_start = 2,
         Social_group_size = Density,
         Density = Density / 47,
         Death_day = Lifespan,
         Lifespan = Death_day + Adult_age_at_start) %>% 
  dplyr::select(line, Sex, Vial_ID, Lifespan, Adult_age_at_start, Death_day, Block, Study, Temperature, Diet, Social_group_size, Density, Mated, Prop_female)
Rows: 3860 Columns: 9
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (4): Line, Study, Sex, Mated
dbl (5): Lifespan, Block, Temperature, Prop_female, Density

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Rows: 3860 Columns: 9
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (4): Line, Study, Sex, Mated
dbl (5): Lifespan, Block, Temperature, Prop_female, Density

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Joining with `by = join_by(Line)`
Code
# follow up

Patel_lines <- Patel_raw_data %>% distinct(line) %>% inner_join(genotyped_lines) %>% nrow()
Joining with `by = join_by(line)`
Code
Dick_raw_data <- read_csv("data/Input/Raw_data/Dick_2011_tidy.csv") %>% 
  dplyr::select(-Replicate) %>% 
  mutate(line = as.factor(Line),
         Sex = as.factor(Sex),
         Vial_ID = as.factor(Vial_ID),
         Block = as.factor(Block),
         Diet = as.numeric(Diet),
         Adult_age_at_start = 0,
         Social_group_size = Density,
         Density = Density / 150,
         Death_day = Lifespan,
         Lifespan = Death_day + Adult_age_at_start) %>% 
  dplyr::select(line, Sex, Vial_ID, Lifespan, Adult_age_at_start, Death_day, Block, Study, Temperature, Diet, Social_group_size, Density, Mated, Prop_female)
Rows: 15397 Columns: 12
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (3): Sex, Mated, Study
dbl (9): Line, Block, Lifespan, Replicate, Prop_female, Temperature, Diet, D...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Code
Dick_lines <- Dick_raw_data %>% distinct(line) %>% inner_join(genotyped_lines) %>% nrow()
Joining with `by = join_by(line)`
Code
Hoffman_raw_data <- read_csv("data/Input/Raw_data/tidy_Hoffman_2021.csv") %>% 
  rename(Vial_ID = Vial) %>% 
  mutate(Sex = if_else(Sex== "F", "Female", "Male")) %>% 
  mutate(line = as.factor(Line),
         Sex = as.factor(Sex),
         Vial_ID = as.factor(Vial_ID),
         Block = as.factor(Block),
         Diet = as.numeric(Diet),
         Adult_age_at_start = 5,
         Social_group_size = Density,
         Density = Density / 47,
         Death_day = Lifespan,
         Lifespan = Death_day + Adult_age_at_start) %>% 
  dplyr::select(line, Sex, Vial_ID, Lifespan, Adult_age_at_start, Death_day, Block, Study, Temperature, Diet, Social_group_size, Density, Mated, Prop_female)
Rows: 8478 Columns: 11
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (3): Sex, Study, Mated
dbl (8): Line, Lifespan, Vial, Block, Diet, Temperature, Prop_female, Density

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Code
Hoffman_lines <- Hoffman_raw_data %>% distinct(line) %>% inner_join(genotyped_lines) %>% nrow()
Joining with `by = join_by(line)`
Code
Zhao_raw_data <- read_csv("data/Input/Raw_data/tidy_Zhao_2022.csv") %>% 
  rename(Vial_ID = Vial) %>% 
  mutate(line = as.factor(Line),
         Sex = as.factor(Sex),
         Vial_ID = as.factor(Vial_ID),
         Block = as.factor(Block),
         Diet = as.numeric(10),
         Adult_age_at_start = 0, # they were 3 days old upon entrance, but this is already accounted for in the death day measure
         Social_group_size = Density,
         Density = Density / 47,
         Death_day = Lifespan,
         Lifespan = Death_day + Adult_age_at_start) %>% 
  dplyr::select(line, Sex, Vial_ID, Lifespan, Adult_age_at_start, Death_day, Block, Study, Temperature, Diet, Social_group_size, Density, Mated, Prop_female)
Rows: 2023 Columns: 11
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (4): Sex, Study, Mated, Diet
dbl (7): Block, Line, Lifespan, Vial, Temperature, Density, Prop_female

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Code
Zhao_lines <- Zhao_raw_data %>% distinct(line) %>% inner_join(genotyped_lines) %>% nrow()
Joining with `by = join_by(line)`
Code
# combine them into a single tibble

All_raw_data <- 
  bind_rows(Arya_raw_data,
            Huang_raw_data,
            Wilson_raw_data,
            Durham_raw_data,
            Patel_raw_data,
            Dick_raw_data,
            Hoffman_raw_data,
            Zhao_raw_data) %>% 
  filter(!is.na(Lifespan)) %>% 
  rename(Yeast_percentage = Diet) %>% 
  left_join(genotyped_lines, by = "line") %>% 
    mutate(Genotyped = if_else(is.na(Genotyped), "NO", Genotyped),
           Density = round(Density, 3))

# write_csv(All_raw_data, file ="data/Input/all_raw_data.csv")

#my_data_table(All_raw_data)

If you load the data, it will have the following columns

line: the DGRP line (genotype)

Sex: is the fly a female or a male?

Vial_ID: in many of the studies, multiple flies were kept together in the same environment. It is common practice to record which flies shared the same environment, because a common environment can explain why some flies might exhibit similar lifespan e.g. if humidity is slightly lower in one environment than another, an increased risk of desiccation is shared by all flies in that environment. It is called Vial_ID because the most common way to house Drosophila is in cylindrical vials.

Lifespan: the number of days the fly survived as an adult.

Adult_age_at_start: often, a few days elapsed before adult flies entered the lifespan assay. Thus, for comparison across studies, it is important to add these days to the Death_day count to find the true adult lifespan.

Death_day: the number of days the fly survived as an adult, that were tracked by the lifespan assay.

Block: because of the large number of DGRP lines, often studies split their experiment into separate blocks, each of which assays a fraction of the total number of lines. These blocks are temporally separated, and generally deal with different generations of flies. Note that blocks are rarely balanced with respect to line e.g. all the sampling of line 10 is done in block 1.

Study: the study that first reports the data.

Temperature: the temperature, in degrees Celsius, that the lifespan assay was completed at.

Yeast_percentage: the overall % of the food recipe made up by brewers yeast. This is a common ingredient in Drosophila food, and the key source of protein. Several studies in our dataset were specifically interested in restriction of protein availability as a method to extend lifespan.

Social_group_size: the number of flies housed together in a vial (or bottle), at the onset of the fitness assay. As individuals died, social group sizes decreased.

Density: the number of flies per cm^3 of available space. Calculated as Social_group_size / 47cm^3 for Drosophila vials and Social_group_size / 150cm^3 for Drosophila bottles.

Mated: across our identified studies, three mating states were used in the lifespan assays. Flies were either kept as virgins for their entire lives (Mated = NO), were allowed to mate for a short period prior to the lifespan assay (Mated = Prior to assay), or kept in mixed sex groups and thus allowed to mate throughout their lives.

Prop_female: the proportion of females kept within each environment.

Genotyped: indicates whether the DGRP line has sequence data available. Most do, but a few lines included in the early studies (pre-2012) did not end up getting genotyped.

Meta-analytic GWAS of life expectancy and lifespan equality

Load data

In part 1 of this study, we calculated mean values and standard error for each combination of line, sex, study, temperature and mating status. These data are displayed, and can be downloaded from, the below table. Note that for GWA and other SNP based analysis, we removed lines that had not been genotyped (shown as Genotyped = NO).

Code
full_dataset <- 
  read_csv("data/input/line_means.csv") %>% 
  mutate(line = as.factor(Line),) %>%
  dplyr::select(-Line) %>% 
  left_join(genotyped_lines, by = "line") %>% 
  mutate(Genotyped = if_else(is.na(Genotyped), "NO", Genotyped))
Rows: 2815 Columns: 17
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr  (5): Sex, Mated, Study, Density_cat, best_mod
dbl (12): Line, Temperature, Block, e0, SE_e0, h, SE_h, samp, a, b, a_SE, b_SE

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Code
my_data_table(full_dataset)

For CPASSOC, we split the data into their respective treatments. For the purposes of this analysis, we follow a common convention in quantitative genetics and consider each lifespan measurement a separate trait e.g. lifespan at 25 degrees is a separate trait from lifespan at 18 degrees.

Code
Arya_2010_f <-
full_dataset %>% 
  filter(Study == "Arya_2010" & Sex == "Female" & Genotyped == "YES")

Arya_2010_m <-
full_dataset %>% 
  filter(Study == "Arya_2010" & Sex == "Male" & Genotyped == "YES")

Huang_2020_f_18 <-
  full_dataset %>% 
  filter(Study == "Huang_2020" & Sex == "Female" & Temperature == 18)

Huang_2020_m_18 <-
  full_dataset %>% 
  filter(Study == "Huang_2020" & Sex == "Male" & Temperature == 18)

Huang_2020_f_25 <-
  full_dataset %>% 
  filter(Study == "Huang_2020" & Sex == "Female" & Temperature == 25)

Huang_2020_m_25 <-
  full_dataset %>% 
  filter(Study == "Huang_2020" & Sex == "Male" & Temperature == 25)

Huang_2020_f_28 <-
  full_dataset %>% 
  filter(Study == "Huang_2020" & Sex == "Female" & Temperature == 28)

Huang_2020_m_28 <-
  full_dataset %>% 
  filter(Study == "Huang_2020" & Sex == "Male" & Temperature == 28)

Wilson_2020_f <-
  full_dataset %>% 
  filter(Study == "Wilson_2020") %>% 
  distinct(line, .keep_all = TRUE)

Durham_2014_f <-
  full_dataset %>% 
  filter(Study == "Durham_2014") %>% 
  distinct(line, .keep_all = T)

Patel_2021_f <-
  full_dataset %>% 
  filter(Study == "Patel_2021")

We also load the residual line means for each sex, which are the line means after controlling for temperature, mating status and the Study_ID random effect.

Code
residual_means <- 
  read_csv("data/Input/MCMC_residuals.csv") %>% 
  mutate(line = as.factor(line)) %>% 
  inner_join(genotyped_lines) %>% 
  rename(e0 = res_e0,
         h = res_h)
Rows: 659 Columns: 4
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (1): Sex
dbl (3): line, res_e0, res_h

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Joining with `by = join_by(line)`

Performing univariate GWAS

The preparation of data for GWAS is adapted from Holman and Wong’s DGRP GWAS of fitness in different contexts. See their associated workflowr report for a comprehensive breakdown of their analysis.

Install neccessary software and build helper functions

In addition to the R packages we load, plink 1.9 and beagle must also be installed. These software packages allow us to wrangle the data into the correct format and impute missing values, both of which are required to run GWAS with the Gemma R package.

Code
# build a function to prepare data for GWAS

prep_for_e0_GWAS <- function(data, sex){
data %>% 
  #inner_join(read_csv("data/Input/Genotyped_lines.csv"), by = "line") %>%  # filter to lines that have been genotyped
  mutate(line = glue("line{line}")) %>% 
  dplyr::select(line, e0)
}

prep_for_h_GWAS <- function(data, sex){
data %>% 
  #inner_join(read_csv("data/Input/Genotyped_lines.csv"), by = "line") %>%  # filter to lines that have been genotyped
  mutate(line = glue("line{line}")) %>% 
  dplyr::select(line, h)
}

prep_for_SA_GWAS <- function(data){
data %>% 
  #inner_join(read_csv("data/Input/Genotyped_lines.csv"), by = "line") %>%  # filter to lines that have been genotyped
  mutate(line = glue("line{line}")) %>% 
  dplyr::select(line, SA_axis)
}

# I used bigsnpr::download_plink(dir = "code") which puts it in the code folder - I'm using a windows operating system.

# Beagle is a software package for phasing genotypes and imputing ungenotyped markers.

plink <- paste(getwd(), "code/plink", sep = "/")
beagle <- bigsnpr::download_beagle()


# # Load the wolbachia infection status data from the DGRP website
# wolbachia <- read_csv("data/input/wolbachia.csv") %>% 
#   mutate(line = paste("line", line, sep = ""))
# 
# # Load the chomosomal inversion data from the DGRP website
# # these are the 5 inversions that Huang et al. PNAS corrected for
# inversions <- read_csv("data/input/inversion genotypes.csv") %>%
#     mutate(line = paste("line", line, sep = "")) %>%
#     select(line, `In(2L)t`, `In(2R)NS`, `In(3R)P`, `In(3R)K`, `In(3R)Mo`) 

# helper function to pass commands to the terminal
# Note that we set `intern = TRUE`, and pass the result of `system()` to `cat()`, 
# ensuring that the Terminal output will be printed in this knitr report.
# 
# This is the mac OS function
 
#run_command <- function(shell_command, wd = getwd(), path = ""){
#  cat(system(glue("cd ", wd, path, # tell terminal which directory to work in
#                  "\n",shell_command), # on a second terminal line, run the plink command
#             intern = TRUE), sep = '\n')  
#}

# This is the windows function 

run_command <- function(plink_command, wd = getwd(), path = "") {
  # Specify the full path to the plink executable within the 'code' subdirectory.
  plink_path <- paste(getwd(), "code/plink", sep = "/")
  
  # Create the full shell command with the plink executable.
  command <- glue("cmd.exe /c cd /d {shQuote(file.path(wd, path))} && {shQuote(plink_path)} {plink_command}")
  
  # Execute the combined command.
  result <- system(command, intern = TRUE)
  
  # Print the result.
  cat(result, sep = '\n')
  
  # Return the result as a character vector.
  return(result)
}

# sometimes we need to run terminal commands without plink - create a slightly different function to do this

run_command_no_plink <- function(shell_command, wd = getwd(), path = "") {
  
  # Create the full shell command with the plink executable.
  command <- glue("cmd.exe /c cd /d {shQuote(file.path(wd, path))} && {shell_command}")
  
  # Execute the combined command.
  result <- system(command, intern = TRUE)
  
  # Print the result.
  cat(result, sep = '\n')
  
  # Return the result as a character vector.
  return(result)
}

Build a function to create manhattan plots for later

Code
build_manhattan_plot <- function(gwas_results){
  
  manhattan_data <- gwas_results %>%
    mutate(position = str_split(SNP, "_"), # split the SNP name into logical bits
           chr = map_chr(position, ~ .x[1]), # the first bit is the chromosome arm - name the column appropriately
           position = as.numeric(map_chr(position, ~ .x[2])), # where on the chromosome is the SNP found
           pval = -1 * log10(P)) %>% # make visualising the p values easier
    dplyr::select(chr, position, pval)
  
  # this next chunk finds convenient cuts for labels and colour changes 
  
  max_pos <- manhattan_data %>%
    group_by(chr) %>%
    summarise(
      max_pos = max(position), 
      middle_pos = (max_pos - min(position)) / 2,
      .groups = "drop") %>%
    as.data.frame()
  
  max_pos$max_pos <- c(0, cumsum(max_pos$max_pos[1:5]))
  
  max_pos$label_pos <- max_pos$max_pos + max_pos$middle_pos
  
  # combine the two dataframes created above
  
  manhattan_data <- manhattan_data %>%
    left_join(max_pos, by = "chr") %>%
    mutate(position = position + max_pos,
           chromosome_colour = case_when(chr == "2L" | chr == "3L" | chr == "4" ~ "A",
                                         .default = "B"),
           Notable = if_else(pval >= 8, "YES", "NO"))
  
  plot <- manhattan_data %>%
    #filter(Notable == "NO") %>% 
    ggplot(aes(position, pval, group = chr, stroke = 0.01)) +
    geom_point(aes(colour = chromosome_colour), size = 1.6) +
    geom_hline(yintercept = 5, linetype = 2, colour = "red", linewidth = 1.2) +
    scale_colour_manual(values = c(met.brewer(name = "Hokusai3")[3], met.brewer(name = "Hokusai3")[6])) +
    #geom_label_repel(data = manhattan_data %>% filter(common_SNP == "YES" &),  
     #              aes(label = position, fill = "aliceblue")) +
    #geom_point(data = manhattan_data %>% filter(Notable == "YES"),
     #          fill = "aliceblue", size = 3, 
      #         shape = 21, colour = "grey2") +
    scale_x_continuous(breaks = max_pos$label_pos, labels = max_pos$chr) +
    scale_y_continuous(expand = c(0,0), limits = c(0, 20)) +
    ylab("-log~10~(_p_)") + 
    xlab("Chromosome and position") +
    theme_classic() +
    theme(legend.position = "none",
          axis.title.y = element_markdown(size = 18),
          axis.title.x = element_markdown(size = 18),
          axis.text.x = element_text(size = 15),
          axis.text.y = element_text(size = 15))  
}

Perform SNP quality control and impute missing data

We cleaned up the DGRP’s .bed/.bim/.fam files (available from the Mackay lab website) by:

  1. Removing any SNPs for which genotypes are missing for >10% of the DGRP lines. We then use the software Beagle to impute the remaining missing genotypes. Imputation takes a long time so ideally, you only want to do it once.

  2. Removing SNPs with a minor allele frequency of less than 5%. We have negilible power to detect associations for rare SNPs below this threshold.

In the PLINK-formatted genotype files, lines fixed for the major allele are coded as 2, and lines fixed for the minor allele as 0. This means that in the association tests we calculate, negative effect sizes mean that the minor allele is associated with lower trait values, while positive effect sizes means that the minor allele is associated with higher trait values.

Code
Run_function <- FALSE # Change this to TRUE to run the models

# If the imputation is not already done, create the following 3 files of imputed genotype data: 
# dgrp2_QC_all_lines_imputed_correct.bed/bim/fam
if(!file.exists("data/Derived/dgrp2_QC_all_lines_imputed_correct.bed")) {
  perform_SNP_QC_and_imputation(phenotypes = predicted_line_means)
}

if(Run_function){
  
  # Use Plink to clean and subset the DGRP's SNP data as follows:
  # Only keep SNPs for which at least 90% of DGRP lines were successfully genotyped (--geno 0.1)
  # Only keep SNPs with a minor allele frequency of 0.05 or higher (--maf 0.05)
  # Finally, write the processed BIM/BED/FAM files to the data/derived directory
  
  output_directory <-  paste(getwd(), "data/Derived", sep = "/")
  
  run_command(glue("--bfile dgrp2",
                   " --geno 0.1 --maf 0.05 --allow-no-sex", 
                   " --make-bed --out {shQuote(output_directory)}/dgrp2_QC_all_lines"), path = "data/Input/")
  
  # Use the shell command 'sed' to remove underscores from the DGRP line names in the .fam file (e.g. 'line_120' becomes 'line120')
  # Otherwise, these underscores cause trouble when we need to convert from PLINK to vcf format (vcf format uses underscore as a separator)
  # I manually edited the text file to do this rather than using the below command, which I don't have quite working
  #for(i in 1:2) run_command_no_plink("sed -i '' 's/_//' dgrp2_QC_all_lines.fam", path = "/data/Derived/")
                
  
  # Now impute the missing genotypes using Beagle
  # This part uses the data for the full DGRP panel of >200 lines, to infer missing genotypes as accurately as possible. 
  # This step uses a lot of memory (max memory was set to 28MB in prior analysis, and it used 26.5GB), but maybe it can also run on a less powerful computer?
  # The bigsnpr package provides a helpful wrapper for Beagle called snp_beagleImpute(): it translates to a VCF file and back again using PLINK
  snp_beagleImpute(beagle, plink, 
                   bedfile.in = "data/Derived/dgrp2_QC_all_lines.bed", 
                   bedfile.out = "data/Derived/dgrp2_QC_all_lines_imputed.bed",
                   ncores = 4, 
                   memory.max = 16)
  
  # assign a sex of 'female' to all the DGRP lines (Beagle removes the sex, and it seems PLINK needs individuals to have a sex, 
  # despite PLINK having a command called --allow-no-sex)
  run_command("sed -i '' 's/    0   0   0/  0   0   2/' dgrp2_QC_all_lines_imputed.fam", path = "/data/Derived/")
  
  # Re-write the .bed file, to make sure the MAF threshold and minor/major allele designations are correctly assigned post-Beagle
  run_command(glue("--bfile dgrp2_QC_all_lines_imputed",
                   " --geno 0.1 --maf 0.05 --allow-no-sex", 
                   " --make-bed --out dgrp2_QC_all_lines_imputed_correct"), path = "/data/Derived/")
  #unlink(list.files("data/derived", pattern = "~", full.names = TRUE)) # delete the original files, which were given a ~ file name by PLINK
} else "Already run"
[1] "Already run"

Build GWAS function

Note: because PLINK defines the minor allele as the alternative allele (lines fixed for the minor allele are scored as genotype: 2, and those with the major allele as genotype: 0), a positive effect size in these association tests means the minor allele is associated with a higher value of the trait in question.

Code
run_GWAS <- function(phenotypes){
  
  # Make a list of the lines in our sample and save as a text file for passing to PLINK
  lines_to_keep <- phenotypes %>% dplyr::select(line) %>% mutate(line_2 = line)
  write.table(lines_to_keep, 
              row.names = FALSE, 
              col.names = FALSE, 
              file = "data/Derived/lines_to_keep.txt", 
              quote = FALSE)

  # Now cull the PLINK files to just the lines that we measured, and re-apply the 
  # MAF cut-off of 0.05 for the new smaller sample of DGRP lines
  run_command(glue("--bfile dgrp2_QC_all_lines_imputed_correct",
                   " --keep-allele-order", # force PLINK to retain the major/minor allele designations that apply to the DGRP as a whole
                   " --keep lines_to_keep.txt --geno 0.1 --maf 0.05", 
                   " --make-bed --out dgrp2_QC_focal_lines"), path = "/data/Derived/")
  
    # Define a function to add our phenotype data to a .fam file, which is needed for GWAS analysis and to make sure PLINK includes these samples
  # The 'phenotypes' data frame needs to have a column called 'line'
  add_phenotypes_to_fam <- function(filepath, phenotypes){
    read_delim(filepath, col_names = FALSE, delim = " ") %>% 
      dplyr::select(X1, X2, X3, X4, X5) %>% # Get all the non-phenotype columns
      left_join(phenotypes, 
                by = c("X1" = "line")) %>%
      write.table(file = "data/Derived/dgrp2_QC_focal_lines_NEW.fam", 
                  col.names = FALSE, row.names = FALSE, 
                  quote = FALSE, sep = " ")
    
    unlink("data/Derived/dgrp2_QC_focal_lines.fam")
    file.rename("data/Derived/dgrp2_QC_focal_lines_NEW.fam", 
                "data/Derived/dgrp2_QC_focal_lines.fam")
  }
  
  # edit the new FAM file to add the phenotype data from 'phenotypes'
  add_phenotypes_to_fam("data/Derived/dgrp2_QC_focal_lines.fam", phenotypes)
  

  # Run mixed-model GWAS (in practice, the relatedness between almost all pairs of lines 
  # is sufficiently low that PLINK always instead chooses to run a linear model)
  
  run_command("--bfile dgrp2_QC_focal_lines  --assoc --maf 0.05", 
              path = "/data/Derived")
  
  # wrangle the GWAS results
  
  Focal_name <- deparse(substitute(phenotypes))
  
  gwas_results <- read.table("data/Derived/plink.qassoc", 
                             header = TRUE) %>% 
    dplyr::select(SNP, BETA, SE, "T", P)

  # Save a file containing all SNPs with P < 1e-5:
  gwas_results %>% 
    filter(P < 1e-05) %>% 
    write_tsv(glue("data/Derived/GWAS_results/{Focal_name}_significant_SNPs.tsv.gz"))

  # Rename and compress the GWAS summary stats file 
  # The filter step means that only variants in the LD-pruned subset get saved to disk.
  gwas_results %>% 
    filter(SNP %in% (pull(read_tsv("data/Derived/plink.prune.in", col_names = "SNP"), SNP))) %>% 
    write_tsv(glue("data/Derived/GWAS_results/{Focal_name}.tsv.gz"))
  unlink("data/Derived/plink.qassoc")
  
  # Rename the plink log file
  file.rename("data/Derived/plink.log",
              glue("data/Derived/{Focal_name}_log.txt"))
  
  unlink("data/Derived/dgrp2_QC_focal_lines.bim")
  unlink("data/Derived/dgrp2_QC_focal_lines.bed")
  unlink("data/Derived/dgrp2_QC_focal_lines.fam")
  unlink("data/Derived/dgrp2_QC_focal_lines.log")
} 

Run GWAS

Code
# if not already done, run the GWA tests

if(!file.exists("data/Derived/GWAS_results/Arya_f_l.tsv.gz")) {

Arya_f_l <- prep_for_e0_GWAS(Arya_2010_f)
Arya_m_l <- prep_for_e0_GWAS(Arya_2010_m)
Arya_f_h <- prep_for_h_GWAS(Arya_2010_f)
Arya_m_h <- prep_for_h_GWAS(Arya_2010_m)
Huang_f_18_l <- prep_for_e0_GWAS(Huang_2020_f_18)
Huang_f_18_h <- prep_for_h_GWAS(Huang_2020_f_18)
Huang_m_18_l <- prep_for_e0_GWAS(Huang_2020_m_18)
Huang_m_18_h <- prep_for_h_GWAS(Huang_2020_m_18)
Huang_f_25_l <- prep_for_e0_GWAS(Huang_2020_f_25)
Huang_f_25_h <- prep_for_h_GWAS(Huang_2020_f_25)
Huang_m_25_l <- prep_for_e0_GWAS(Huang_2020_m_25)
Huang_m_25_h <- prep_for_h_GWAS(Huang_2020_m_25)
Huang_f_28_l <- prep_for_e0_GWAS(Huang_2020_f_28)
Huang_f_28_h <- prep_for_h_GWAS(Huang_2020_f_28)
Huang_m_28_l <- prep_for_e0_GWAS(Huang_2020_m_28)
Huang_m_28_h <- prep_for_h_GWAS(Huang_2020_m_28)
Wilson_f_l <- prep_for_e0_GWAS(Wilson_2020_f)
Wilson_f_h <- prep_for_h_GWAS(Wilson_2020_f)
Durham_f_l <- prep_for_e0_GWAS(Durham_2014_f)
Durham_f_h <- prep_for_h_GWAS(Durham_2014_f)
Patel_f_l <- prep_for_e0_GWAS(Patel_2021_f)
Patel_f_h <- prep_for_h_GWAS(Patel_2021_f)

residual_f_l <- prep_for_e0_GWAS(residual_means %>% filter(Sex == "Female"))
residual_m_l <- prep_for_e0_GWAS(residual_means %>% filter(Sex == "Male"))
residual_f_h <- prep_for_h_GWAS(residual_means %>% filter(Sex == "Female"))
residual_m_h <- prep_for_h_GWAS(residual_means %>% filter(Sex == "Male"))

run_GWAS(Arya_f_l)
run_GWAS(Arya_m_l)
run_GWAS(Arya_f_h)
run_GWAS(Arya_m_h)
run_GWAS(Huang_f_18_l)
run_GWAS(Huang_f_18_h)
run_GWAS(Huang_m_18_l)
run_GWAS(Huang_m_18_h)
run_GWAS(Huang_f_25_l)
run_GWAS(Huang_f_25_h)
run_GWAS(Huang_m_25_l)
run_GWAS(Huang_m_25_h)
run_GWAS(Huang_f_28_l)
run_GWAS(Huang_f_28_h)
run_GWAS(Huang_m_28_l)
run_GWAS(Huang_m_28_h)
run_GWAS(Wilson_f_l)
run_GWAS(Wilson_f_h)
run_GWAS(Durham_f_l)
run_GWAS(Durham_f_h)
run_GWAS(Patel_f_l)
run_GWAS(Patel_f_h)

run_GWAS(residual_f_l)
run_GWAS(residual_m_l)
run_GWAS(residual_f_h)
run_GWAS(residual_m_h)
}

Add allele IDs and MAFs to annotation database

The annotation text file was downloaded from the DGRP website: .

Code
options(future.globals.maxSize = 2000 * 1024 ^ 2, 
        stringsAsFactors = FALSE)

# Helper function to split a vector into chunks 
chunker <- function(x, max_chunk_size) split(x, ceiling(seq_along(x) / max_chunk_size))

get_variant_annotations <- function(){
  
  # Load up the big annotation file, get pertinent info. It's stored in some sort of text string format
  annot <- read.table("data/Input/dgrp.fb557.annot.txt", header = FALSE, stringsAsFactors = FALSE)
  
  get.info <- function(rows){
    lapply(rows, function(row){
      site.class.field <- strsplit(annot$V3[row], split = "]")[[1]][1]
      num.genes <- str_count(site.class.field, ";") + 1
      output <- cbind(rep(annot$V1[row], num.genes), 
                      do.call("rbind", lapply(strsplit(site.class.field, split = ";")[[1]], 
                                              function(x) strsplit(x, split = "[|]")[[1]])))
      if(ncol(output) == 5) return(output[,c(1,2,4,5)]) # only return SNPs that have some annotation. Don't get the gene symbol
      else return(NULL)
    }) %>% do.call("rbind", .)
  }
  
  plan("multisession")
  variant.details <- future_lapply(chunker(1:nrow(annot), max_chunk_size = 10000), get.info) %>% 
    do.call("rbind", .) %>% as.data.frame()
  
  names(variant.details) <- c("SNP", "FBID", "site.class", "distance.to.gene")
  variant.details$FBID <- unlist(str_extract_all(variant.details$FBID, "FBgn[:digit:]+")) # clean up text strings for Flybase ID
  variant.details %>%
    dplyr::filter(site.class != "FBgn0003638") %>% # NB this is a bug in the DGRP's annotation file
    mutate(chr = str_remove_all(substr(SNP, 1, 2), "_")) # get chromosome now for faster sorting later
}

if(!file.exists("data/Derived/annotations.csv")){
get_variant_annotations() %>% 
write_csv("data/Derived/annotations.csv")
} else annotations <- read_csv("data/Derived/annotations.csv")
Rows: 4622971 Columns: 5
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (4): SNP, FBID, site.class, chr
dbl (1): distance.to.gene

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Code
# Extract and save the MAFs, SNP positions, and major/minor alleles
MAFs <- read.table("data/derived/plink.frq", 
                   header = TRUE, stringsAsFactors = FALSE) %>% 
  mutate(position = map_chr(
    strsplit(SNP, split = "_"), 
    function(x) x[2])) %>%
  dplyr::select(SNP, position, MAF, A1, A2) %>% 
  rename(minor_allele = A1,
         major_allele = A2) 

MAFs <- annotations %>% 
  dplyr::select(SNP, FBID, site.class, distance.to.gene, chr) %>% collect() %>%
  full_join(MAFs, by = "SNP") %>%
  filter(!is.na(MAF)) %>%
  mutate(site.class = replace(site.class, is.na(site.class), "INTERGENIC")) %>% 
  as_tibble()

Applying cross-phenotype meta-analysis

The method

The power to detect variants associated with correlated phenotypes can be increased if a meta-analytic approach is adopted (Zhu et al, 2018). Here, we use the CPASSOC approach developed by Zhou et al (2015), which evaluates the null hypothesis that SNPs are associated with none of the considered traits, weighted by the sample size of each study and adjusted for the trait correlation matrix. The steps to apply CPASSOC are as follows:

  1. Estimate \(R\), the trait correlation matrix. In the DGRP, this can be done using SNP data or using line means.

  2. GWAS each trait separately (see above).

  3. Collate effect sizes for each trait into a vector \(\beta\), for each SNP.

  4. Use a Wald test statistic to estimate a vector of p-values, \(T\), from the \(\beta\) and \(\sigma\) (which is approximated from the study sample size) estimates for each SNP.

  5. Test whether \(\beta\) = 0. If the trait data are homogeneous (SNPs are expected to affect all traits in the same direction), use:

\[S_{Hom} = \frac{e^T(RW)^{-1}T(e^T(RW)^{-1}T)^T}{e^T(WRW)^{-1}e}\] where \(W\) is a diagonal matrix of weights for the individual test statistics (either the inverse of the variance or simply the sample size).

  1. If there is heterogeneity between trait measures (i.e. it is a reasonable expectation that SNPs could affect some traits in one direction and others in the opposing direction), \(S_{Hom}\) is not appropriate. The ideal test statistic in this case would only include the cohorts and traits with a true contribution to the association of a genetic variant. To implement this, the value \(\tau\) is used as a threshold, below which traits are not included in the test statistic. This statistic, \(S_{Het}\) can be viewed as the maximum of the weighted sum of trait-specific test statistics satisfying different thresholds. To calculate \(S_{Het}\) first find,

\[S_{\tau} = \frac{e^T(R(\tau)W(\tau))^{-1}T(\tau)(e^T(R(\tau)W(\tau))^{-1}T(\tau))^T}{e^TW(\tau)^{-1}R(\tau)^{-1}W(\tau)^{-1}e}\] When \(\tau\) is large, \(S(\tau)\) can be undefined if the test statistic falls below \(\tau\) for all cohorts. In this case \(S(\tau) = 0\). Our test statistic is then

\[\displaystyle S_{Het} = \max_{r \gt 0} S(\tau)\]

Anyway, I’ve only included that for the mathematically inclined. Code to implement both statistics in R can be downloaded here, and is implemented below.

Generate the genetic correlation matrix

Using line means

Code
female_data <-
  bind_rows(Arya_2010_f,
            Huang_2020_f_18,
            Huang_2020_f_25,
            Huang_2020_f_28,
            Wilson_2020_f,
            Durham_2014_f,
            Patel_2021_f) %>% 
  dplyr::select(line, Study, Sex, Temperature, e0, h) %>% 
  pivot_wider(values_from = c(e0, h), names_from = c(Study, Sex, Temperature)) 

female_data_e0 <-
  female_data %>% 
  dplyr::select(contains("e0"))

female_data_h <-
  female_data %>% 
  dplyr::select(!contains("e0"), -line)

female_e0_corr_matrix <- cor(female_data_e0, use = "pairwise.complete.obs", method = "spearman")
female_h_corr_matrix <- cor(female_data_h, use = "pairwise.complete.obs", method = "spearman")

male_data <-
  bind_rows(Arya_2010_m,
            Huang_2020_m_18,
            Huang_2020_m_25,
            Huang_2020_m_28) %>% 
  dplyr::select(line, Study, Sex, Temperature, e0, h) %>% 
  pivot_wider(values_from = c(e0, h), names_from = c(Study, Sex, Temperature)) 

male_data_e0 <-
  male_data %>% 
  dplyr::select(contains("e0"))

male_data_h <-
  male_data %>% 
  dplyr::select(!contains("e0"), -line)

male_e0_corr_matrix <- cor(male_data_e0, use = "pairwise.complete.obs", method = "spearman")
male_h_corr_matrix <- cor(male_data_h, use = "pairwise.complete.obs", method = "spearman")

all_data <- inner_join(female_data, male_data, by = "line")

all_data_e0 <- 
  all_data %>% 
  dplyr::select(contains("e0")) %>% 
  rename(Arya_f = e0_Arya_2010_Female_25,
         Huang_f_18 = e0_Huang_2020_Female_18,
         Huang_f_25 =  e0_Huang_2020_Female_25,
         Huang_f_28 = e0_Huang_2020_Female_28,
         Wilson_f = e0_Wilson_2020_Female_25,
         Durham_f = e0_Durham_2014_Female_25,
         Patel_f = e0_Patel_2021_Female_23,
         Arya_m = e0_Arya_2010_Male_25,
         Huang_m_18 = e0_Huang_2020_Male_18,
         Huang_m_25 = e0_Huang_2020_Male_25,
         Huang_m_28 = e0_Huang_2020_Male_28)

all_data_h <-
  all_data %>% 
  dplyr::select(!contains("e0"), -line) %>% 
  rename(Arya_f = h_Arya_2010_Female_25,
         Huang_f_18 = h_Huang_2020_Female_18,
         Huang_f_25 =  h_Huang_2020_Female_25,
         Huang_f_28 = h_Huang_2020_Female_28,
         Wilson_f = h_Wilson_2020_Female_25,
         Durham_f = h_Durham_2014_Female_25,
         Patel_f = h_Patel_2021_Female_23,
         Arya_m = h_Arya_2010_Male_25,
         Huang_m_18 = h_Huang_2020_Male_18,
         Huang_m_25 = h_Huang_2020_Male_25,
         Huang_m_28 = h_Huang_2020_Male_28)

all_e0_corr_matrix <- cor(all_data_e0, use = "pairwise.complete.obs", method = "spearman")
all_h_corr_matrix <- cor(all_data_h, use = "pairwise.complete.obs", method = "spearman")

Let’s visualise the correlations between lifespan measures. First for life expectancy:

Code
pheatmap(all_e0_corr_matrix, cutree_rows = 3, cutree_cols = 3)

Now for lifespan equality

Code
pheatmap(all_h_corr_matrix, cutree_rows = 3, cutree_cols = 3)

Using SNP effect sizes Not yet done

Code
#female_e0_SNP_corr_matrix <- 
 # cor(Female_e0_effect_sizes %>% dplyr::select(-SNP), use = "pairwise.complete.obs", method = "spearman")

Calculate meta-analytic test statistics

Note that the MASS package is required to run the functions we load below. Unfortunately this clashes with dplyrs select(). So be prepared to run detach("package:MASS", unload=TRUE) or use dplyr::select() to get some things to work if you’re moving back and forth throughout the code.

Code
source("FunctionSet.R") # saved as a separate file in the project. 

Attaching package: 'MASS'
The following object is masked from 'package:patchwork':

    area
The following object is masked from 'package:dplyr':

    select

Attaching package: 'Matrix'
The following objects are masked from 'package:tidyr':

    expand, pack, unpack
Loading required package: compiler

e0, sexes combined

The purpose of this meta-analysis is to search for SNPs that have some effect on ageing, expressed in many different contexts (sexes, temperatures and mating status’).

To conduct CPASSOC for a given SNP, we need a summary statistic from each GWAS. A different number of lines were included in each GWAS, which can cause small differences in the number of SNPs assessed. We therefore prune the list of SNPs to those included in all univariate analyses. After pruning (and previous pruning to remove SNPs in linkage disequilibrium), 220,582 SNPs remain.

Code
# load GWAS results

Arya_f_l_GWAS <- read_tsv("data/Derived/GWAS_results/Arya_f_l.tsv.gz") %>% 
  dplyr::select(SNP, T) %>% 
  rename(Arya_f = T)
Rows: 224164 Columns: 5
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr (1): SNP
dbl (4): BETA, SE, T, P

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Code
Huang_f_18_l_GWAS <- read_tsv("data/Derived/GWAS_results/Huang_f_18_l.tsv.gz") %>% 
  dplyr::select(SNP, T) %>% 
  rename(Huang_f_18 = T)
Rows: 225148 Columns: 5
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr (1): SNP
dbl (4): BETA, SE, T, P

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Code
Huang_f_25_l_GWAS <- read_tsv("data/Derived/GWAS_results/Huang_f_25_l.tsv.gz") %>% 
  dplyr::select(SNP, T) %>% 
  rename(Huang_f_25 = T)
Rows: 225593 Columns: 5
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr (1): SNP
dbl (4): BETA, SE, T, P

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Code
Huang_f_28_l_GWAS <- read_tsv("data/Derived/GWAS_results/Huang_f_28_l.tsv.gz") %>% 
  dplyr::select(SNP, T) %>% 
  rename(Huang_f_28 = T)
Rows: 225938 Columns: 5
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr (1): SNP
dbl (4): BETA, SE, T, P

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Code
Wilson_f_l_GWAS <- read_tsv("data/Derived/GWAS_results/Wilson_f_l.tsv.gz") %>% 
  dplyr::select(SNP, T) %>%  
  rename(Wilson_f_25  = T)
Rows: 223416 Columns: 5
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr (1): SNP
dbl (4): BETA, SE, T, P

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Code
Durham_f_l_GWAS <- read_tsv("data/Derived/GWAS_results/Durham_f_l.tsv.gz") %>% 
  dplyr::select(SNP, T) %>%  
  rename(Durham_f_25 = T)
Rows: 225819 Columns: 5
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr (1): SNP
dbl (4): BETA, SE, T, P

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Code
Patel_f_l_GWAS <- read_tsv("data/Derived/GWAS_results/Patel_f_l.tsv.gz") %>% 
  dplyr::select(SNP, T) %>%  
  rename(Patel_f_23 = T)
Rows: 226248 Columns: 5
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr (1): SNP
dbl (4): BETA, SE, T, P

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Code
Arya_m_l_GWAS <- read_tsv("data/Derived/GWAS_results/Arya_m_l.tsv.gz") %>% 
  dplyr::select(SNP, T) %>% 
  rename(Arya_m = T)
Rows: 224164 Columns: 5
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr (1): SNP
dbl (4): BETA, SE, T, P

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Code
Huang_m_18_l_GWAS <- read_tsv("data/Derived/GWAS_results/Huang_m_18_l.tsv.gz") %>% 
  dplyr::select(SNP, T) %>% 
  rename(Huang_m_18  = T)
Rows: 225148 Columns: 5
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr (1): SNP
dbl (4): BETA, SE, T, P

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Code
Huang_m_25_l_GWAS <- read_tsv("data/Derived/GWAS_results/Huang_m_25_l.tsv.gz") %>% 
  dplyr::select(SNP, T) %>% 
  rename(Huang_m_25 = T)
Rows: 225593 Columns: 5
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr (1): SNP
dbl (4): BETA, SE, T, P

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Code
Huang_m_28_l_GWAS <- read_tsv("data/Derived/GWAS_results/Huang_m_28_l.tsv.gz") %>% 
  dplyr::select(SNP, T) %>% 
  rename(Huang_m_28  = T)
Rows: 225938 Columns: 5
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr (1): SNP
dbl (4): BETA, SE, T, P

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Code
all_e0_effect_sizes <-
  Arya_f_l_GWAS %>%
  inner_join(Huang_f_18_l_GWAS, by = "SNP") %>%
  inner_join(Huang_f_25_l_GWAS, by = "SNP") %>%
  inner_join(Huang_f_28_l_GWAS, by = "SNP") %>% 
  inner_join(Wilson_f_l_GWAS, by = "SNP") %>% 
  inner_join(Durham_f_l_GWAS, by = "SNP") %>% 
  inner_join(Patel_f_l_GWAS, by = "SNP") %>% 
  inner_join(Arya_m_l_GWAS, by = "SNP") %>% 
  inner_join(Huang_m_18_l_GWAS, by = "SNP") %>% 
  inner_join(Huang_m_25_l_GWAS, by = "SNP") %>%
  inner_join(Huang_m_28_l_GWAS, by = "SNP")


all_e0_effect_sizes_values <-
  all_e0_effect_sizes %>% 
  dplyr::select(2:12)

Sample_size_all <- c(165, 165, 183, 183, 186, 186, 177, 177, 161, 189, 193) 

if(!file.exists("data/Derived/all_e0_meta_results.csv")) {

# run the homogeneous effect meta-analysis

S_hom <- SHom(all_e0_effect_sizes_values, Sample_size_all, all_e0_corr_matrix)

# calculate meta-p-values and bind the two together with the SNP names

p_S_hom <- pchisq(S_hom, df = 1, ncp = 0, lower.tail = F) %>% 
  as_tibble() %>% 
  bind_cols(S_hom) %>% 
  rename(meta_p_hom = value, 
         S_hom = ...2)

# Calculate S_het, an extension of S_hom that improves power when the genetic effect sizes vary for different traits (e.g. if a SNP has a sex or enviornment opposite effect on lifespan)

# estimate parameters of gamma distribution

para <- EstimateGamma(N = 1E4, Sample_size_all, all_e0_corr_matrix);

S_het <- SHet(all_e0_effect_sizes_values, Sample_size_all, all_e0_corr_matrix)

# obtain P-values of S_Het using the estimated gamma parameters
  
p_S_het <- pgamma(q = S_het-para[3], shape = para[1], scale = para[2], lower.tail = F) %>% 
  as_tibble() %>% 
  bind_cols(S_het) %>% 
  rename(meta_p_het = value, 
         S_het = ...2)


all_e0_meta_results <- 
  all_e0_effect_sizes %>% 
  bind_cols(p_S_hom,
            p_S_het) # add the unadjusted p values

write_csv(all_e0_meta_results, "data/Derived/all_e0_meta_results.csv")

} else all_e0_meta_results <- read_csv("data/Derived/all_e0_meta_results.csv")
Rows: 220582 Columns: 16
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr  (1): SNP
dbl (15): Arya_f, Huang_f_18, Huang_f_25, Huang_f_28, Wilson_f_25, Durham_f_...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

H sexes combined

Code
# load GWAS results

Arya_f_h_GWAS <- read_tsv("data/Derived/GWAS_results/Arya_f_h.tsv.gz") %>% 
  dplyr::select(SNP, T) %>% 
  rename(Arya_f = T)
Rows: 224164 Columns: 5
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr (1): SNP
dbl (4): BETA, SE, T, P

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Code
Huang_f_18_h_GWAS <- read_tsv("data/Derived/GWAS_results/Huang_f_18_h.tsv.gz") %>% 
  dplyr::select(SNP, T) %>% 
  rename(Huang_f_18 = T)
Rows: 225148 Columns: 5
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr (1): SNP
dbl (4): BETA, SE, T, P

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Code
Huang_f_25_h_GWAS <- read_tsv("data/Derived/GWAS_results/Huang_f_25_h.tsv.gz") %>% 
  dplyr::select(SNP, T) %>% 
  rename(Huang_f_25 = T)
Rows: 225593 Columns: 5
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr (1): SNP
dbl (4): BETA, SE, T, P

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Code
Huang_f_28_h_GWAS <- read_tsv("data/Derived/GWAS_results/Huang_f_28_h.tsv.gz") %>% 
  dplyr::select(SNP, T) %>% 
  rename(Huang_f_28 = T)
Rows: 225938 Columns: 5
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr (1): SNP
dbl (4): BETA, SE, T, P

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Code
Wilson_f_h_GWAS <- read_tsv("data/Derived/GWAS_results/Wilson_f_h.tsv.gz") %>% 
  dplyr::select(SNP, T) %>%  
  rename(Wilson_f_25  = T)
Rows: 223416 Columns: 5
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr (1): SNP
dbl (4): BETA, SE, T, P

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Code
Durham_f_h_GWAS <- read_tsv("data/Derived/GWAS_results/Durham_f_h.tsv.gz") %>% 
  dplyr::select(SNP, T) %>%  
  rename(Durham_f_25 = T)
Rows: 225819 Columns: 5
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr (1): SNP
dbl (4): BETA, SE, T, P

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Code
Patel_f_h_GWAS <- read_tsv("data/Derived/GWAS_results/Patel_f_h.tsv.gz") %>% 
  dplyr::select(SNP, T) %>%  
  rename(Patel_f_23 = T)
Rows: 226248 Columns: 5
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr (1): SNP
dbl (4): BETA, SE, T, P

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Code
Arya_m_h_GWAS <- read_tsv("data/Derived/GWAS_results/Arya_m_h.tsv.gz") %>% 
  dplyr::select(SNP, T) %>% 
  rename(Arya_m = T)
Rows: 224164 Columns: 5
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr (1): SNP
dbl (4): BETA, SE, T, P

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Code
Huang_m_18_h_GWAS <- read_tsv("data/Derived/GWAS_results/Huang_m_18_h.tsv.gz") %>% 
  dplyr::select(SNP, T) %>% 
  rename(Huang_m_18  = T)
Rows: 225148 Columns: 5
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr (1): SNP
dbl (4): BETA, SE, T, P

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Code
Huang_m_25_h_GWAS <- read_tsv("data/Derived/GWAS_results/Huang_m_25_h.tsv.gz") %>% 
  dplyr::select(SNP, T) %>% 
  rename(Huang_m_25 = T)
Rows: 225593 Columns: 5
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr (1): SNP
dbl (4): BETA, SE, T, P

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Code
Huang_m_28_h_GWAS <- read_tsv("data/Derived/GWAS_results/Huang_m_28_h.tsv.gz") %>% 
  dplyr::select(SNP, T) %>% 
  rename(Huang_m_28  = T)
Rows: 225938 Columns: 5
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr (1): SNP
dbl (4): BETA, SE, T, P

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Code
all_h_effect_sizes <-
  Arya_f_h_GWAS %>%
  inner_join(Huang_f_18_h_GWAS, by = "SNP") %>%
  inner_join(Huang_f_25_h_GWAS, by = "SNP") %>%
  inner_join(Huang_f_28_h_GWAS, by = "SNP") %>% 
  inner_join(Wilson_f_h_GWAS, by = "SNP") %>% 
  inner_join(Durham_f_h_GWAS, by = "SNP") %>% 
  inner_join(Patel_f_h_GWAS, by = "SNP") %>% 
  inner_join(Arya_m_h_GWAS, by = "SNP") %>% 
  inner_join(Huang_m_18_h_GWAS, by = "SNP") %>% 
  inner_join(Huang_m_25_h_GWAS, by = "SNP") %>%
  inner_join(Huang_m_28_h_GWAS, by = "SNP") 
  

all_h_effect_sizes_values <-
  all_h_effect_sizes %>% 
  dplyr::select(2:12)

if(!file.exists("data/Derived/all_h_meta_results.csv")) {

S_hom <- SHom(all_h_effect_sizes_values, Sample_size_all, all_h_corr_matrix)

# calculate meta-p-values and bind the two together with the SNP names

p_S_hom <- pchisq(S_hom, df = 1, ncp = 0, lower.tail = F) %>% 
  as_tibble() %>% 
  bind_cols(S_hom) %>% 
  rename(meta_p_hom = value, 
         S_hom = ...2)

# Calculate S_het, an extension of S_hom that improves power when the genetic effect sizes vary for different traits (probably not such a problem here given all traits we investigate are some version of life expectancy)

# estimate parameters of gamma distribution

para <- EstimateGamma(N = 1E4, Sample_size_all, all_h_corr_matrix);

S_het <- SHet(all_h_effect_sizes_values, Sample_size_all, all_h_corr_matrix)

# obtain P-values of S_Het using the estimated gamma parameters
  
p_S_het <- pgamma(q = S_het-para[3], shape = para[1], scale = para[2], lower.tail = F) %>% 
  as_tibble() %>% 
  bind_cols(S_het) %>% 
  rename(meta_p_het = value, 
         S_het = ...2)


all_h_meta_results <- 
  all_e0_effect_sizes %>% 
  bind_cols(p_S_hom,
            p_S_het) # add the unadjusted p values

write_csv(all_h_meta_results, "data/Derived/all_h_meta_results.csv")
} else all_h_meta_results <- read_csv("data/Derived/all_h_meta_results.csv")
Rows: 220582 Columns: 16
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr  (1): SNP
dbl (15): Arya_f, Huang_f_18, Huang_f_25, Huang_f_28, Wilson_f_25, Durham_f_...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Plot the results

We combine GWAS summary statistics calculated from lifespan data measured across different contexts. It’s possible that some SNPs have G x E interactions that would lead to a heterogeneous effect across phenotypes. We therefore plot the S_het calculated p-values.

Code
common_SNPs <- 
  all_e0_meta_results %>% filter(meta_p_het < 0.00005) %>% 
  inner_join(all_h_meta_results %>% filter(meta_p_het < 0.00005), by = "SNP") %>% 
  mutate(common_SNP = "YES") %>% 
  dplyr::select(SNP, common_SNP)

e0_results <- 
  all_e0_meta_results %>% 
  dplyr::select(SNP, meta_p_hom, meta_p_het) %>% 
  rename(P = meta_p_het) %>% 
  left_join(common_SNPs) %>% 
  mutate(common_SNP = if_else(is.na(common_SNP), "NO", common_SNP))
Joining with `by = join_by(SNP)`
Code
h_results <- 
  all_h_meta_results %>% 
  dplyr::select(SNP, meta_p_hom, meta_p_het) %>% 
  rename(P = meta_p_het) %>% 
  left_join(common_SNPs) %>% 
  mutate(common_SNP = if_else(is.na(common_SNP), "NO", common_SNP))
Joining with `by = join_by(SNP)`
Code
# plot the results using the manhattan plot custom function we defined earlier

e0_het_plot <- build_manhattan_plot(e0_results) +
  labs(title = "Life expectancy") +
  theme(plot.title = element_text(size = 20, hjust = 0.5))

h_het_plot <- build_manhattan_plot(h_results) +
  labs(title = "Lifespan equality") +
  theme(plot.title = element_text(size = 20, hjust = 0.5))


e0_het_plot + h_het_plot

GWA analysis using residual line means

Calculate genetic correlations

Using a hierarchical model, we previously partitioned the variance in life expectancy and lifespan equality caused by variation in:

  • Temperature

  • Mating status

  • Study conditions e.g. lab effects such as diet used.

  • Measurement error

With these sources of variation quantified, we can account for them and predict line means for both life expectancy and lifespan equality. These means represent very close approximations to the true across-context genotypic value of each line. Put another way, these estimates reflect the phenotypic product of the genotype that is not affected by the (measured) environment.

Code
residual_f_l_GWAS <- read_tsv("data/Derived/GWAS_results/residual_f_l.tsv.gz") 
Rows: 226770 Columns: 5
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr (1): SNP
dbl (4): BETA, SE, T, P

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Code
residual_m_l_GWAS <- read_tsv("data/Derived/GWAS_results/residual_m_l.tsv.gz") 
Rows: 226628 Columns: 5
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr (1): SNP
dbl (4): BETA, SE, T, P

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Code
residual_f_h_GWAS <- read_tsv("data/Derived/GWAS_results/residual_f_h.tsv.gz") 
Rows: 226770 Columns: 5
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr (1): SNP
dbl (4): BETA, SE, T, P

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Code
residual_m_h_GWAS <- read_tsv("data/Derived/GWAS_results/residual_m_h.tsv.gz") 
Rows: 226628 Columns: 5
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr (1): SNP
dbl (4): BETA, SE, T, P

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Code
residual_SNP_data <-
  residual_f_l_GWAS %>% 
  rename(Lifespan_female = BETA) %>% 
  #mutate(Sex = "Female") %>% 
  dplyr::select(SNP, Lifespan_female) %>% 
  left_join(
    residual_m_l_GWAS %>% 
      rename(Lifespan_male = BETA) %>% 
      #mutate(Sex = "Male") %>% 
      dplyr::select(SNP, Lifespan_male)) %>% 
  left_join(
    residual_f_h_GWAS %>% 
      rename(Equality_female = BETA) %>% 
      #mutate(Sex = "Female") %>% 
      dplyr::select(SNP, Equality_female)) %>% 
  left_join(
    residual_m_h_GWAS %>% 
      rename(Equality_male = BETA) %>% 
      #mutate(Sex = "Male") %>% 
      dplyr::select(SNP, Equality_male))
Joining with `by = join_by(SNP)`
Joining with `by = join_by(SNP)`
Joining with `by = join_by(SNP)`

Calculate the genetic correlation between life expectancy and lifespan equality, measured in both sexes.

Code
residual_SNP_data %>% 
  dplyr::select(-SNP) %>% 
  cor(use = "pairwise.complete.obs") %>% 
  kable(digits = 3) %>% kable_styling()
Lifespan_female Lifespan_male Equality_female Equality_male
Lifespan_female 1.000 0.720 -0.262 -0.188
Lifespan_male 0.720 1.000 -0.313 -0.243
Equality_female -0.262 -0.313 1.000 0.676
Equality_male -0.188 -0.243 0.676 1.000

There is a strong positive intersex-genetic correlation for both traits. However, and most intriguingly, lifespan has a negative genetic correlation with lifespan equality. Is this an artefact of how lifespan equality is calculated? I don’t think so, as we’ve previously seen positive correlations using the same metrics. What’s different is we’re now looking at the GWA beta coefficients calculated from the residual line mean values.

Lets plot the data to show these genetic correlations

Code
p1 <-
  residual_SNP_data %>% 
  ggplot(aes(x = Lifespan_female, y = Lifespan_male)) +
  geom_vline(xintercept = 0, linetype = 2, linewidth = 1) +
  geom_hline(yintercept = 0, linetype = 2, linewidth = 1) +
  stat_binhex(bins = 50) +
  scale_fill_moma_c("Exter", direction = -1) +
  #geom_point() +
  #stat_smooth(method = "lm", formula = y ~ x + I(x^2), linewidth = 0.75, colour = "#33CC66") +
  coord_cartesian(xlim = c(-5, 5), ylim = c(-5, 5)) +
  xlab("Effect on female life expectancy") + 
  ylab("Effect on male life expectancy") +
  theme_bw() +
  theme(legend.position = "none",
        text = element_text(size = 12))
p2 <-
  residual_SNP_data %>% 
  ggplot(aes(x = Equality_female, y = Equality_male)) +
  geom_vline(xintercept = 0, linetype = 2, linewidth = 1) +
  geom_hline(yintercept = 0, linetype = 2, linewidth = 1) +
  stat_binhex(bins = 50) +
  scale_fill_moma_c("Exter", direction = -1) +
  #geom_point() +
  #stat_smooth(method = "lm", formula = y ~ x + I(x^2), linewidth = 0.75, colour = "#33CC66") +
  coord_cartesian(xlim = c(-0.1, 0.1), ylim = c(-0.1, 0.1)) +
  xlab("Effect on female lifespan equality") + 
  ylab("Effect on male lifespan equality") +
  theme_bw() +
  theme(legend.position = "none",
        text = element_text(size = 12))
  
p3 <-
  residual_SNP_data %>% 
  ggplot(aes(x = Lifespan_female, y = Equality_female)) +
  geom_vline(xintercept = 0, linetype = 2, linewidth = 1) +
  geom_hline(yintercept = 0, linetype = 2, linewidth = 1) +
  stat_binhex(bins = 50) +
  scale_fill_moma_c("Exter", direction = -1) +
  coord_cartesian(xlim = c(-5, 5), ylim = c(-0.1, 0.1)) +
  xlab("Effect on female lifespan") + 
  ylab("Effect on female lifespan equality") +
  theme_bw() +
  theme(legend.position = "none",
        text = element_text(size = 12))

p4 <-
  residual_SNP_data %>% 
  ggplot(aes(x = Lifespan_male, y = Equality_male)) +
  geom_vline(xintercept = 0, linetype = 2, linewidth = 1) +
  geom_hline(yintercept = 0, linetype = 2, linewidth = 1) +
  stat_binhex(bins = 50) +
  scale_fill_moma_c("Exter", direction = -1) +
  coord_cartesian(xlim = c(-5, 5), ylim = c(-0.1, 0.1)) +
  xlab("Effect on male lifespan") + 
  ylab("Effect on male lifespan equality") +
  theme_bw() +
  theme(legend.position = "none",
        text = element_text(size = 12))

(p1 + p2) / (p3 + p4) + plot_annotation(tag_levels = "A")
Warning: Removed 142 rows containing non-finite values (`stat_binhex()`).
Removed 142 rows containing non-finite values (`stat_binhex()`).
Removed 142 rows containing non-finite values (`stat_binhex()`).

Analysis of sexual antagonism over lifespan

While there is a strong inter-sex genetic correlation for both life expectancy and lifespan equality, both correlations are significantly lower than one. Thus, it is likely that there are alleles that have sex-limited or sex-opposite effects on lifespan/lifespan equality. To identify the latter, we calculate a sexual antagonism index for each line, following previous studies focused on similar questions (e.g. Berger et al, 2014, Grieshop and Arnqvist, 2018, Ruzicka et al, 2020). To create the index, we rotate the coordinate system of the female and male fitness plane by 45 degrees, by multiplying the 204 x 2 fitness matrix by the rotation matrix, R. Figure XX A and B can help visualise this transformation

\[R = \]

Code
e0_matrix <-
  residual_means %>% 
  pivot_wider(names_from = "Sex", values_from = c(e0, h)) %>% 
  filter(!is.na(e0_Male)) %>% 
  dplyr::select(e0_Female, e0_Male) %>% 
  as.matrix()

h_matrix <- 
  residual_means %>% 
  pivot_wider(names_from = "Sex", values_from = c(e0, h)) %>% 
  filter(!is.na(h_Male)) %>% 
  dplyr::select(h_Female, h_Male) %>% 
  as.matrix()
  

rotation_matrix <- matrix(c(-1/sqrt(2), -1/sqrt(2), -1/sqrt(2), 1/sqrt(2)),
        nrow = 2)

e0_antagonism_index <- 
  residual_means %>% 
  pivot_wider(names_from = "Sex", values_from = c(e0, h)) %>% 
  filter(!is.na(e0_Male)) %>% 
  dplyr::select(line, e0_Female, e0_Male) %>% 
  bind_cols(
    e0_matrix %*% rotation_matrix %>% 
      as_tibble() %>% 
      rename(SC_axis  = V1,
             SA_axis = V2))
Warning: The `x` argument of `as_tibble.matrix()` must have unique column names if
`.name_repair` is omitted as of tibble 2.0.0.
ℹ Using compatibility `.name_repair`.
Code
h_antagonism_index <- 
  residual_means %>% 
  pivot_wider(names_from = "Sex", values_from = c(e0, h)) %>% 
  filter(!is.na(h_Male)) %>% 
  dplyr::select(line, h_Female, h_Male) %>% 
  bind_cols(
    h_matrix %*% rotation_matrix %>% 
      as_tibble() %>% 
      rename(SC_axis  = V1,
             SA_axis = V2))

p5 <-
  e0_antagonism_index %>% 
  ggplot(aes(x = e0_Female, y = e0_Male)) +
  geom_vline(xintercept = 0, linetype = 2, linewidth = 1) +
  geom_hline(yintercept = 0, linetype = 2, linewidth = 1) +
  geom_abline(intercept = 0, slope = -1, linewidth = 1, linetype = 2, colour = "salmon") +
  geom_abline(intercept = 0, slope = 1, linewidth = 1, linetype = 2, colour = "salmon") +
  geom_hline(yintercept = 0, linetype = 2, linewidth = 1) +
  geom_point(aes(fill = SA_axis), shape = 21, size = 5) +
  scale_fill_moma_c("Avedon", direction = -1) +
  coord_cartesian(xlim = c(-20, 20), ylim = c(-20, 20)) +
  labs(fill = "SA index",
       x = "Life expectancy (female)",
       y = "Life expectancy (male)") +
  theme_classic()

p6 <-
  h_antagonism_index %>% 
  ggplot(aes(x = h_Female, y = h_Male)) +
  geom_vline(xintercept = 0, linetype = 2, linewidth = 1) +
  geom_hline(yintercept = 0, linetype = 2, linewidth = 1) +
  geom_abline(intercept = 0, slope = -1, linewidth = 1, linetype = 2, colour = "salmon") +
  geom_abline(intercept = 0, slope = 1, linewidth = 1, linetype = 2, colour = "salmon") +
  geom_point(aes(fill = SA_axis), shape = 21, size = 5) +
  scale_fill_moma_c("Avedon", direction = -1) +
  coord_cartesian(xlim = c(-0.6, 0.6), ylim = c(-0.6, 0.6)) +
  labs(fill = "SA index",
       x = "Lifespan equality (female)",
       y = "Lifespan equality (male)") +
  theme_classic()

p5 / p6

Fig XX. Residual line means for female and male life expectancy. Points indicate a single line. The sexual antagonism index ranges from male-beneficial and female-detrimental (orange points) to female-beneficial and male-detrimental (green points). The red lines show the rotation that has been performed to create the antagonism and concordance axis. I don’t actually agree with the use of this metric completely. It finds departures from the perfect regression, which I guess probably are in part caused by SNPs with sex-opposite effects, but they don’t have to be!

Run GWAS

Code
SA_e0 <- prep_for_SA_GWAS(e0_antagonism_index)

SA_h <- prep_for_SA_GWAS(h_antagonism_index)


if(!file.exists("data/Derived/GWAS_results/SA_e0.tsv.gz")) {
run_GWAS(SA_e0)
run_GWAS(SA_h)
}

SA_e0_GWAS <- read_tsv("data/Derived/GWAS_results/SA_e0.tsv.gz") 
Rows: 226628 Columns: 5
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr (1): SNP
dbl (4): BETA, SE, T, P

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Code
SA_h_GWAS <- read_tsv("data/Derived/GWAS_results/SA_h.tsv.gz") 
Rows: 226628 Columns: 5
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr (1): SNP
dbl (4): BETA, SE, T, P

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Plot

Code
# plot the results using the manhattan plot custom function we defined earlier

SA_e0_manhattan_plot <- build_manhattan_plot(SA_e0_GWAS) +
  labs(title = "Life expectancy sexual antagonism") +
  theme(plot.title = element_text(size = 20, hjust = 0.5))

SA_h_manhattan_plot <- build_manhattan_plot(SA_e0_GWAS) +
  labs(title = "Lifespan equality sexual antagonism") +
  theme(plot.title = element_text(size = 20, hjust = 0.5))

SA_e0_manhattan_plot + SA_h_manhattan_plot + plot_annotation(tag_levels = "A")

Testing whether lifespan is a polygenic trait

Code
sim_data <- 
  tibble(draw_1 = rnorm(224164, 0, 1),
         draw_2 = rnorm(224164, 0, 1)) %>% 
   arrange(draw_1) %>%
  mutate(bin = c(rep(1:floor(n()/100), each = 100), 
                 rep(floor(n()/100) + 1, each = n() %% 100))) %>%
  group_by(bin) %>%
  summarise(draw_1 = mean(draw_1), draw_2 = mean(draw_2))

boyle_plot_sim <- 
  sim_data %>%
  ggplot(aes(draw_1, draw_2)) +
  geom_hline(yintercept = 0, linetype = 2) +
  geom_vline(xintercept = 0, linetype = 2) +
  geom_point(alpha = 0.8, size = 2.2) +
  stat_smooth(method = "lm", formula = y ~ x + I(x^2), linewidth = 0.75) +
  coord_cartesian(xlim = c(-4, 4), ylim = c(-4, 4)) +
  xlab("Mean effect size \n (random draw 1)") +
  ylab("Mean effect size \n (random draw 2)") + 
  theme_bw() +
  theme(strip.background = element_blank(),
        strip.text = element_text(hjust=0)) + 
  theme(text = element_text(size = 14))


real_data <-
  Female_GWAS %>% 
  dplyr::select(SNP, BETA) %>% 
  rename(beta_female = BETA) %>% 
  left_join(Male_GWAS %>% 
              dplyr::select(SNP, BETA) %>% 
              rename(beta_male = BETA),
            by = "SNP") %>% 
  filter(!is.na(beta_female) & !is.na(beta_male)) %>%  # remove NAs
  arrange(beta_female) %>%
  mutate(bin = c(rep(1:floor(n()/100), each = 100), 
                 rep(floor(n()/100) + 1, each = n() %% 100))) %>%
  group_by(bin) %>%
  summarise(females = mean(beta_female), males = mean(beta_male))

boyle_plot <- 
  real_data %>%
  ggplot(aes(females, males)) +
  geom_hline(yintercept = 0, linetype = 2) +
  geom_vline(xintercept = 0, linetype = 2) +
  geom_point(alpha = 0.8, size = 2.2) +
  stat_smooth(method = "lm", formula = y ~ x + I(x^2), linewidth = 0.75) +
  coord_cartesian(xlim = c(-4, 4), ylim = c(-4, 4)) +
  xlab("Mean effect size on female mean lifespan") +
  ylab("Mean effect size on male mean lifespan") + 
  theme_bw() +
  theme(strip.background = element_blank(),
        strip.text = element_text(hjust=0)) + 
  theme(text = element_text(size = 14))


boyle_plot_sim /
boyle_plot +
  plot_annotation(tag_levels = 'a')

Estimating the genetic correlation between mean lifespan and lifespan inequality

XX

Effect of mutational burden on lifespan

In this analysis we identify mutations that are likely to be deleterious, count the number of these putatively deleterious mutations per DGRP line, and test whether there is a negative association between mutational burden and lifespan.

The unguarded X

  • The homozygous nature of the DGRP makes this difficult.

  • But we can test if the X is purged of male-harming mutations more-so than female-harming mutations - that is, test the ‘faster X’ hypothesis.

Data preparation

We classified mutations as putatively deleterious if:

  • They are rare in the DGRP (<5% minor allele frequency)

  • They occur in a coding region

  • They have an effect that is likely to incur some functional change.

Following Holman and Wong (2023), we use the SnpEff designation of variants with major effects as those likely to incur a functional change. These variants are:

  • EXON_DELETED

  • NON_SYNONYMOUS_CODING

  • FRAME_SHIFT

  • CODON_CHANGE

  • CODON_INSERTION

  • CODON_CHANGE_PLUS_CODON_INSERTION

  • CODON_DELETION

  • CODON_CHANGE_PLUS_CODON_DELETION

  • STOP_GAINED

  • STOP_LOST

  • RARE_AMINO_ACID

  • START_LOST

  • START_GAINED

Code
# Load the DGRP genotype data using bigsnpr

if(!file.exists("snp_backing_file.bk")) {
rds <- snp_readBed("data/Input/dgrp2.bed", backingfile = "snp_backing_file")
}

bed_file <- snp_attach("snp_backing_file.rds")

annotations <- read.table("data/Input/dgrp.fb557.annot.txt", 
                    header = FALSE, stringsAsFactors = FALSE)

# Use PLINK to get the allele frequencies across the 205 DGRP lines

run_command("--bfile dgrp2  --freq", path = "/data/Input")

major_effect_types <- c(
  "EXON_DELETED", "NON_SYNONYMOUS_CODING",
  "FRAME_SHIFT", "CODON_CHANGE",
  "CODON_INSERTION",
  "CODON_CHANGE_PLUS_CODON_INSERTION",
  "CODON_DELETION",
  "CODON_CHANGE_PLUS_CODON_DELETION",
  "STOP_GAINED",
  "STOP_LOST",
  "RARE_AMINO_ACID",
  "START_LOST",
  "START_GAINED"
)

# this chunk detects and filters for any of the above variant types in the annotation dataset

all_major_mutations <- annotations$V1[unique(c(
  which(str_detect(annotations$V3, major_effect_types[1])),
  which(str_detect(annotations$V3, major_effect_types[2])),
  which(str_detect(annotations$V3, major_effect_types[3])),
  which(str_detect(annotations$V3, major_effect_types[4])),
  which(str_detect(annotations$V3, major_effect_types[5])),
  which(str_detect(annotations$V3, major_effect_types[6])),
  which(str_detect(annotations$V3, major_effect_types[7])),
  which(str_detect(annotations$V3, major_effect_types[8])),
  which(str_detect(annotations$V3, major_effect_types[9])),
  which(str_detect(annotations$V3, major_effect_types[10])),
  which(str_detect(annotations$V3, major_effect_types[11])),
  which(str_detect(annotations$V3, major_effect_types[12])),
  which(str_detect(annotations$V3, major_effect_types[13]))
))]

# now filter down to those with 0 < MAF < 0.05

rare_alleles <- read.table("data/Input/plink.frq", header = T) %>% 
  filter(MAF < 0.05 & MAF > 0) %>% pull(SNP)

# find alleles on the X

X_linked_alleles <- read.table("data/Input/plink.frq", header = T) %>% 
  filter(CHR == 5) %>% pull(SNP)

# and those on autosomes

autosomal_alleles <- read.table("data/Input/plink.frq", header = T) %>% 
  filter(CHR != 5) %>% pull(SNP)

# Get the indexes of variants that are major mutations and also have MAF < 0.05

indexes <- intersect(
  which(bed_file$map$marker.ID %in% all_major_mutations), 
  which(bed_file$map$marker.ID %in% rare_alleles)
)

# Get the indexes of variants that are major mutations and also have MAF < 0.05 and that are on the X

indexes_X <- intersect(
  indexes,
  which(bed_file$map$marker.ID %in% X_linked_alleles)
)

indexes_a <- intersect(
  indexes,
  which(bed_file$map$marker.ID %in% autosomal_alleles)
)

# Function to count the number of mutations in all 205 DGRP lines

count_mutations <- function(indexes){
  tibble(
    line = bed_file$fam$family.ID,
    mutation_count = sapply(
      1:205, function(i) sum(bed_file$genotypes[i, ][indexes] == 2, na.rm = T))
  ) %>% 
    mutate(line = str_remove(line, "_"))
}

mutational_burden <- count_mutations(indexes)

X_burden <- count_mutations(indexes_X)

autosomal_burden <- count_mutations(indexes_a)


# Integrate mutational burden with line lifespan data 

mutational_burden_line_data <-
  left_join(Arya_female_lifespan %>% rename(Female_lifespan = Lifespan),
            Arya_male_lifespan %>% rename(Male_lifespan = Lifespan)) %>% 
  left_join(mutational_burden,
            by = "line")

Run the bayesian linear regression

Code
mutational_burden_line_data <-
  mutational_burden_line_data %>% 
  mutate(Female_lifespan_standard = (Female_lifespan - mean(Female_lifespan))/ sd(Female_lifespan),
         Male_lifespan_standard = (Male_lifespan - mean(Male_lifespan))/ sd(Male_lifespan),
         mutation_count_100 = mutation_count/100)

# run this on individual data
individual_level_data <- 
  Arya_raw_data %>% dplyr::select(line, Lifespan, Sex) %>% 
  mutate(line = as.character(line),
         line = paste("line", line, sep = "")) %>%
  group_by(Sex) %>% 
  mutate(row = row_number()) %>%
  pivot_wider(names_from = Sex, values_from = Lifespan) %>% 
  dplyr::select(-row) %>% 
  right_join(mutational_burden, by = "line") %>% 
  rename(Female_lifespan = F, Male_lifespan = M) %>% 
  mutate(Female_lifespan_standard = 
           (Female_lifespan - mean(Female_lifespan, na.rm = T))/ sd(Female_lifespan, na.rm = T),
         Male_lifespan_standard = 
           (Male_lifespan - mean(Male_lifespan, na.rm = T))/ sd(Male_lifespan, na.rm = T),
         mutation_count_100 = mutation_count/100)

# raw data

individual_multivariate_mutational_burden_model <-
  brm(bf(
    mvbind(Female_lifespan, 
           Male_lifespan) ~ mutation_count_100 + (1|line)) + set_rescor(TRUE), 
    data = individual_level_data,
    family = gaussian,
    prior = c(prior(normal(60, 20), class = Intercept, resp = Femalelifespan),
              prior(normal(60, 20), class = Intercept, resp = Malelifespan),
              prior(normal(0, 2), class = b, resp = Femalelifespan),
              prior(normal(0, 2), class = b, resp = Malelifespan),
              prior(normal(1, 1), class = sigma, resp = Femalelifespan),
              prior(normal(1, 1), class = sigma, resp = Malelifespan),
              prior(lkj(2), class = rescor)),
    warmup = 2000, iter = 6000, chains = 4, cores = 4)

# means

multivariate_mutational_burden_model <-
  brm(bf(
    mvbind(Female_lifespan, 
           Male_lifespan) ~ mutation_count_100) + set_rescor(TRUE), 
    data = mutational_burden_line_data,
    family = gaussian,
    prior = c(prior(normal(60, 20), class = Intercept, resp = Femalelifespan),
              prior(normal(60, 20), class = Intercept, resp = Malelifespan),
              prior(normal(0, 2), class = b, resp = Femalelifespan),
              prior(normal(0, 2), class = b, resp = Malelifespan),
              prior(normal(1, 1), class = sigma, resp = Femalelifespan),
              prior(normal(1, 1), class = sigma, resp = Malelifespan),
              prior(lkj(2), class = rescor)),
    warmup = 2000, iter = 6000, chains = 4, cores = 4)


multivariate_mutational_burden_model

Plot results

Code
regression_lines <- conditional_effects(multivariate_mutational_burden_model, plot = F)

regression_lines <-
  bind_rows(
  regression_lines[[1]] %>% 
  as_tibble() %>% 
  dplyr::select(mutation_count_100, estimate__, se__, lower__, upper__) %>% 
  rename(Posterior_lifespan = estimate__, lower = lower__, upper = upper__) %>% 
  mutate(Sex = "Female"),

  regression_lines[[2]] %>% 
  as_tibble() %>% 
  dplyr::select(mutation_count_100, estimate__, se__, lower__, upper__) %>% 
  rename(Posterior_lifespan = estimate__, lower = lower__, upper = upper__) %>% 
  mutate(Sex= "Male")
)


mutational_burden_line_data %>%
  dplyr::select(2:3, mutation_count_100) %>% 
  pivot_longer(1:2, names_to = "Sex",
               values_to = "Posterior_lifespan") %>% 
  mutate(Sex = str_remove(Sex, "_lifespan")) %>% 
ggplot(aes(x = mutation_count_100,
           y = Posterior_lifespan)) +
  geom_point(size = 3) +
  geom_line(data = regression_lines, aes(y = Posterior_lifespan, x = mutation_count_100,
                                         colour = Sex), linewidth = 1.2) +
  geom_ribbon(data = regression_lines, aes(ymin = lower, ymax = upper,
                                           fill = Sex),
              alpha = 0.5) +
  scale_fill_manual(values = c(met.brewer("Hokusai3")[2], met.brewer("Hokusai3")[3])) +
  scale_colour_manual(values = c(met.brewer("Hokusai3")[2], met.brewer("Hokusai3")[3])) +
  facet_wrap(~Sex) +
  labs(x = "No. deleterious mutations (100s)",
       y = "Mean lifespan (days)") +
  theme_bigstatsr() +
  theme(legend.position = "none")